import warnings
warnings.filterwarnings("ignore")
The following code was taken from the examples given by the instructor. We did not make any changes unless specified otherwise.
from keras.models import Model, Sequential, load_model
from keras.layers import Conv2D, Input
import keras.backend as K
LAMBDA=1
def l2_loss(x):
    """Return half the sum of squared elements of `x` (standard L2 loss)."""
    squared_total = K.sum(K.square(x))
    return squared_total / 2
class EncoderDecoder:
    """Auto-encoder built from a truncated, frozen VGG19 encoder and a
    trainable decoder, used for image reconstruction.

    The encoder is VGG19 cut at `target_layer` (1 = relu1_1, 2 = relu2_1, ...);
    the decoder either comes from `decoder_path` (a saved Keras model) or is
    built fresh by `create_decoder`.  The combined model is compiled with Adam
    and a loss that mixes pixel-space and feature-space reconstruction error.
    """

    def __init__(self, input_shape=(256, 256, 3), target_layer=2,
                 decoder_path=None):
        self.input_shape = input_shape
        self.target_layer = target_layer
        # Frozen VGG19 feature extractor truncated at the target layer.
        self.encoder = VGG19(input_shape=input_shape, target_layer=target_layer)
        # Load a previously trained decoder, or build a fresh untrained one.
        if decoder_path:
            self.decoder = load_model(decoder_path)
        else:
            self.decoder = self.create_decoder(target_layer)
        # Full auto-encoder: encoder followed by decoder.
        self.model = Sequential()
        self.model.add(self.encoder)
        self.model.add(self.decoder)
        self.loss = self.create_loss_fn(self.encoder)
        self.model.compile('adam', self.loss)

    def create_loss_fn(self, encoder):
        """Return a loss function combining pixel L2 and encoding L2, the
        latter weighted by the module-level LAMBDA.

        NOTE(review): the `encoder` parameter is unused — `get_encodings`
        builds a new VGG19 on each call instead.  Presumably this is so the
        loss can encode arbitrary symbolic tensors, but it looks expensive;
        verify it does not reconstruct/reload the network per evaluation.
        """
        def get_encodings(inputs):
            # Encode `inputs` with a VGG19 truncated at self.target_layer.
            encoder = VGG19(inputs, self.input_shape, self.target_layer)
            return encoder.output
        def loss(img_in, img_out):
            encoding_in = get_encodings(img_in)
            encoding_out = get_encodings(img_out)
            # Pixel reconstruction error plus feature reconstruction error.
            return l2_loss(img_out - img_in) + \
                LAMBDA*l2_loss(encoding_out - encoding_in)
        return loss

    def create_decoder(self, target_layer):
        """Build an untrained decoder whose input shape matches the encoder's
        output, ending in a 3-channel convolution that produces the image."""
        inputs = Input(shape=self.encoder.output_shape[1:])
        layers = decoder_layers(inputs, target_layer)
        output = Conv2D(3, (3, 3), activation='relu', padding='same',
                        name='decoder_out')(layers)
        return Model(inputs, output, name='decoder_%s' % target_layer)

    def export_decoder(self):
        """Save the decoder to decoder_<target_layer>.h5 in the CWD."""
        self.decoder.save('decoder_%s.h5' % self.target_layer)
from keras.models import Model
from keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D, Input
from keras.utils.data_utils import get_file
import keras.backend as K
import h5py
import numpy as np
import tensorflow as tf
WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5'
MEAN_PIXEL = np.array([103.939, 116.779, 123.68])
WEIGHTS_PATH = get_file('vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5',
WEIGHTS_PATH_NO_TOP,
cache_subdir='models',
file_hash='253f8cb515780f3b799900260a226db6')
def vgg_layers(inputs, target_layer):
    """Apply the VGG19 convolutional stack to `inputs`, stopping right after
    the first convolution of block `target_layer` (i.e. reluN_1).

    For target_layer >= 5 (or any other value), the full stack through
    block5_conv1 is returned.
    """
    # Block 1
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv1')(inputs)
    if target_layer == 1:
        return x
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
    # Block 2
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
    if target_layer == 2:
        return x
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
    # Block 3
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
    if target_layer == 3:
        return x
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv4')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
    # Block 4
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
    if target_layer == 4:
        return x
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv4')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
    # Block 5
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
    return x
def load_weights(model):
    """Copy the pre-trained VGG19 weights into `model` and freeze those layers.

    Only layers whose names appear in the downloaded weight file are touched;
    any other layers keep their current weights and remain trainable.
    """
    # Open read-only (h5py's default mode is deprecated); the context manager
    # guarantees the file is closed even if set_weights raises — the original
    # leaked the handle on error.
    with h5py.File(WEIGHTS_PATH, 'r') as f:
        # Byte-string layer names stored in the HDF5 attributes.
        layer_names = set(f.attrs['layer_names'])
        for layer in model.layers:
            b_name = layer.name.encode()
            if b_name in layer_names:
                g = f[b_name]
                weights = [g[name] for name in g.attrs['weight_names']]
                layer.set_weights(weights)
                # Freeze: these layers are a fixed feature extractor.
                layer.trainable = False
def VGG19(input_tensor=None, input_shape=None, target_layer=1):
    """
    VGG19, up to the target layer (1 for relu1_1, 2 for relu2_1, etc.)

    Builds the truncated network, loads the pre-trained ImageNet weights into
    the layers that exist, and freezes them (see load_weights).  Either pass
    `input_shape` to create a fresh Input, or `input_tensor` to build the
    model on top of an existing tensor.
    """
    if input_tensor is None:
        inputs = Input(shape=input_shape)
    else:
        # Wrap the existing tensor so it can serve as the model's input.
        inputs = Input(tensor=input_tensor, shape=input_shape)
    model = Model(inputs, vgg_layers(inputs, target_layer), name='vgg19')
    load_weights(model)
    return model
def preprocess_input(x):
    """Convert an RGB image (NumPy array or TF tensor) to BGR and subtract
    the ImageNet mean pixel, matching VGG's training preprocessing.
    """
    # Convert 'RGB' -> 'BGR' by reversing the channel axis.
    # isinstance (not `type(x) is ...`) so ndarray subclasses take the
    # NumPy path too.
    if isinstance(x, np.ndarray):
        x = x[..., ::-1]
    else:
        x = tf.reverse(x, [-1])
    return x - MEAN_PIXEL
from keras.layers import Input, Conv2D, UpSampling2D, Conv2DTranspose
def decoder_layers(inputs, layer):
    """Build the decoder stack for an encoder truncated at VGG block `layer`.

    Each stage transposed-convolves (stride 2) to undo one pooling step and
    then applies the mirror-image convolutions.  Returns the final tensor for
    `layer` in 1..5; implicitly returns None for any other value.

    NOTE(review): the filter counts grow with depth (64 -> 512) whereas a
    mirror of the VGG encoder would shrink them toward the output (512 -> 64).
    Any trained weights depend on these exact widths, so they are left as-is —
    confirm the counts are intentional before reusing this architecture.
    """
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='decoder_block5_conv1')(inputs)
    if layer == 1:
        return x
    # Undo one 2x2 pooling, then the block's convolutions.
    x = Conv2DTranspose(64, (4,4), strides=2, activation='relu', padding='same', name='decoder_block4_upsample')(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='decoder_block4_conv4')(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='decoder_block4_conv3')(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='decoder_block4_conv2')(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='decoder_block4_conv1')(x)
    if layer == 2:
        return x
    x = Conv2DTranspose(128, (4,4), strides=2, activation='relu', padding='same', name='decoder_block3_upsample')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='decoder_block3_conv4')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='decoder_block3_conv3')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='decoder_block3_conv2')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='decoder_block3_conv1')(x)
    if layer == 3:
        return x
    x = Conv2DTranspose(256, (4,4), strides=2, activation='relu', padding='same', name='decoder_block2_upsample')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='decoder_block2_conv2')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='decoder_block2_conv1')(x)
    if layer == 4:
        return x
    x = Conv2DTranspose(512, (4,4), strides=2, activation='relu', padding='same', name='decoder_block1_upsample')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='decoder_block1_conv2')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='decoder_block1_conv1')(x)
    if layer == 5:
        return x
We decided to use the COCO dataset (https://cocodataset.org/) that was referenced in the paper (https://arxiv.org/abs/1705.08086). However, in the interest of time, we only trained on a subset of the full dataset. We randomly picked 10,000 images for training our decoder and 2,000 images for validation.
The code in model.py (method create_loss_fn) already incorporates the combined image (pixel) loss and encoding loss into the overall loss function. We verified that it is indeed using this loss function.
The training was done on a gpgpu-1 node on M2. The training times were respectively:
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
from keras.callbacks import Callback
import numpy as np
import pandas as pd
from PIL import Image
from model import EncoderDecoder
from util import count_num_samples
PATH = 'data/'
TARGET_SIZE = (256, 256)
BATCH_SIZE = 64 # Increased batch size to make the training less computational
epochs = 20 # Increased number of epochs
def create_gen(img_dir, target_size, batch_size, clas):
    """Yield (input, target) batches for auto-encoder training, where the
    target is the input batch itself.  Batches smaller than `batch_size`
    (e.g. the final partial batch) are skipped entirely.
    """
    flow = ImageDataGenerator().flow_from_directory(
        img_dir, classes=[clas], target_size=target_size,
        batch_size=batch_size, class_mode=None)

    def pairs():
        for batch in flow:
            # Only yield full batches; drop trailing partial ones.
            if batch.shape[0] == batch_size:
                yield (batch, batch)

    return pairs()
import sys  # fix: sys.argv is used below but `sys` was never imported

# Data generators: each yields (img, img) pairs for auto-encoder training.
gen_train = create_gen(PATH, TARGET_SIZE, BATCH_SIZE, 'train')  # training
gen_val = create_gen(PATH, TARGET_SIZE, BATCH_SIZE, 'val')      # validation

num_samples = count_num_samples(PATH)
steps_per_epoch = num_samples // BATCH_SIZE

# VGG19 block at which the encoder is truncated, from the command line.
target_layer = int(sys.argv[1])
encoder_decoder = EncoderDecoder(target_layer=target_layer)
history = encoder_decoder.model.fit_generator(gen_train, steps_per_epoch=steps_per_epoch,
                                              epochs=epochs, validation_data=gen_val,
                                              validation_steps=9)
encoder_decoder.export_decoder()  # save the trained decoder to disk
# Persist the per-epoch losses for later plotting.
hist_df = pd.DataFrame(history.history)
hist_df.to_pickle('train_hist_' + str(target_layer) + '.pkl')
As mentioned we trained our model using 10,000 images and validated using 2000. We trained for 20 epochs with a batch size of 64. We trained two decoders, one using block two and one using block 4. The results of the training are shown below.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
directory = '/mnt/c/Users/trozz/Desktop'
layers_2 = pd.read_pickle(directory+'/train_hist_2.pkl')
layers_4 = pd.read_pickle(directory+'/train_hist_4.pkl')
# --- Linear-scale training/validation loss curves for both decoders ---
# NOTE: plt.subplots() creates its own figure; the original's preceding
# plt.figure() calls opened empty extra figure windows and were removed.
fig, axs = plt.subplots(ncols=2, constrained_layout=True, figsize=(17, 7))
sns.lineplot(ax=axs[0], data=layers_2)
sns.lineplot(ax=axs[1], data=layers_4)
plt.suptitle('Loss vs Epochs')
for i in [0, 1]:
    axs[i].set_xlabel('Epochs')
    axs[i].set_ylabel('Loss')
    axs[i].set_xticks(np.linspace(1, 20, 20, dtype=int))
axs[0].set_title('Layer 2')
axs[1].set_title('Layer 4')
sns.despine()
plt.show()

# --- Same curves on a log scale, to make early convergence visible ---
fig, axs = plt.subplots(ncols=2, constrained_layout=True, figsize=(17, 7))
sns.lineplot(ax=axs[0], data=np.log(layers_2))
sns.lineplot(ax=axs[1], data=np.log(layers_4))
plt.suptitle('Log Loss vs Epochs')
for i in [0, 1]:
    axs[i].set_xlabel('Epoch')
    axs[i].set_ylabel('Log loss')
    axs[i].set_xticks(np.linspace(1, 20, 20, dtype=int))
axs[0].set_title('Layer 2')
axs[1].set_title('Layer 4')
sns.despine()
plt.show()
# To obtain a reconstruction of the images we use the provided evaluate-decoder.py script, printed below.
from keras.preprocessing import image
import numpy as np
from scipy.misc import imsave, imshow
DECODER_PATH = '/mnt/c/Users/trozz/Desktop/decoder_2.h5'
img_names = ['deadman.jpg', 'mosaic.jpg', 'pencil.jpg']
models = [2, 4]
# Reconstruct the test images with the block-2 decoder.
# Fix: build the auto-encoder ONCE outside the loop — the original
# re-created (and re-loaded from disk) the full model for every image.
# The stray plt.figure() before plt.subplots() was also removed.
encoder_decoder = EncoderDecoder(decoder_path=DECODER_PATH, target_layer=2)
fig, axs = plt.subplots(ncols=len(img_names), constrained_layout=True, figsize=(15, 5))
for i, img_name in enumerate(img_names):
    input_img = image.load_img('/mnt/c/Users/trozz/Desktop/images/' + img_name,
                               target_size=(256, 256))
    input_img = image.img_to_array(input_img)
    input_img = np.expand_dims(input_img, axis=0)  # add batch dimension
    output_img = encoder_decoder.model.predict([input_img])[0]
    axs[i].imshow(np.uint8(output_img))
plt.show()
# Reconstruct the same test images with the block-4 decoder.
DECODER_PATH = '/mnt/c/Users/trozz/Desktop/decoder_4.h5'
img_names = ['deadman.jpg', 'mosaic.jpg', 'pencil.jpg']
models = [2, 4]  # NOTE(review): unused here; kept for parity with the block-2 cell
# Fix: build the auto-encoder once, outside the loop (the original rebuilt
# it per image); stray plt.figure() before plt.subplots() removed.
encoder_decoder = EncoderDecoder(decoder_path=DECODER_PATH, target_layer=4)
fig, axs = plt.subplots(ncols=len(img_names), constrained_layout=True, figsize=(15, 5))
for i, img_name in enumerate(img_names):
    input_img = image.load_img('/mnt/c/Users/trozz/Desktop/images/' + img_name,
                               target_size=(256, 256))
    input_img = image.img_to_array(input_img)
    input_img = np.expand_dims(input_img, axis=0)  # add batch dimension
    output_img = encoder_decoder.model.predict([input_img])[0]
    axs[i].imshow(np.uint8(output_img))
plt.show()
As we can see, the reconstruction of the images with layer 2 is highly satisfactory. On the other hand, the reconstruction done with the model up to block 4 shows defects in color reconstruction and a grid artifact. To the best of our knowledge this would have been caused by a mismatch between stride and kernel size in the Conv2DTranspose layers of our decoder. That is not the case in our model (stride 2, kernel (4,4)), so we cannot explain the origin of these grid-like patterns. It might be caused by a lack of training of this block. From the losses we observed that this model reached convergence after one epoch with very high values, suggesting a lack of learning of the layers when 4 blocks are used in the encoder and decoder.
Due to this lackluster recontruction and the considerably high losses, we decided to apply the pretrained models for the style transfer portion of this assignment.
Given the issues with training, we decided to use the pre-trained model presented in class for the style transfer portion of this lab. There is some custom code, but most of it comes from the notebooks presented in class. The code was fairly well commented, but we did fortify it to show our understanding.
from pathlib import PurePath
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Utility to show images
def show_images_in_a_row(images, titles):
    """Display the given images side by side in one row, titled from `titles`.

    4-D entries are treated as batches and only their first sample is shown.
    """
    plt.figure(figsize=(20, 10))
    total = len(images)
    for position, img in enumerate(images, start=1):
        plt.subplot(1, total, position)
        if len(img.shape) == 4:
            plt.imshow(img[0, :, :, :])  # first sample of the batch
        else:
            plt.imshow(img)
        plt.title(titles[position - 1])
# Utility to show layer images
def show_images_of_a_layer(decoded_images, layer):
    """Show the styled and reconstructed outputs for one VGG block."""
    key = 'layer{}'.format(layer)
    styled, reconstructed = decoded_images[key][0], decoded_images[key][1]
    labels = ['Layer {} - Styled'.format(layer),
              'Layer {} - Reconstructed'.format(layer)]
    show_images_in_a_row([styled, reconstructed], labels)
# Utility to show a bunch of images
def show_images_in_a_block(images):
    """Plot an iterable of (title, image) pairs in a two-column grid."""
    columns = 2
    # Ceiling division (the original used float division plus a remainder
    # check, and walked a manual iterator with a processed counter).
    rows = -(-len(images) // columns)
    fig = plt.figure(figsize=(10, 180))
    for position, (alpha, image) in enumerate(images, start=1):
        ax = fig.add_subplot(rows, columns, position)
        ax.set_title(alpha)
        plt.imshow(image)
    plt.show()
# Code obtained from https://github.com/8000net/LectureNotesMaster/blob/master/03c%20UniversalStyleTransfer.ipynb
#
# Comments have been fortified by the authors of this notebook
def load_img(path_to_img):
    """Load the image (acceptable formats are BMP, GIF, JPEG, PNG) identified
    by path_to_img and return it as a float32 tensor rescaled so that its
    longest side is 512 pixels, preserving the original aspect ratio.  The
    image is assumed to be RGB; a leading batch dimension is added.
    """
    max_dim = 512
    # Load the image into memory as a float tensor in [0, 1].
    img = tf.io.read_file(path_to_img)
    img = tf.image.decode_image(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    # Scale factor that maps the longest side to max_dim.
    shape = tf.cast(tf.shape(img)[:-1], tf.float32)
    long_dim = max(shape)
    scale = max_dim / long_dim
    # Resize (this also upsizes images smaller than max_dim), then add the
    # batch dimension.
    new_shape = tf.cast(shape * scale, tf.int32)
    img = tf.image.resize(img, new_shape)
    img = img[tf.newaxis, :]
    return img
# Code obtained from https://github.com/8000net/LectureNotesMaster/blob/master/03c%20UniversalStyleTransfer.ipynb
#
# Comments have been fortified by the authors of this notebook
class VGG19AutoEncoder(tf.keras.Model):
    """Keras model of a pre-trained VGG19 auto-encoder used for Universal
    Style Transfer (https://arxiv.org/pdf/1705.08086.pdf).

    `files_path` must contain one saved model per VGG19 block (blocks 1-3
    only).  Each saved model is composed of an encoder (layers[0]), a decoder
    (layers[1]) and an output convolution (layers[2]) that maps activations
    back to a viewable 3-channel image.  Block 1 is the exception: VGG19 does
    no down-sampling before relu1_1, so its model is just encoder (layers[0])
    plus output convolution (layers[1]).

    Calling the model with {'style': <style_image>, 'content': <content_image>}
    (4-D float tensors) applies a whitening-coloring transform (WCT) at each
    block, deep-to-shallow by default, progressively re-styling the content
    image.  The result is a dict {'layer#': (<styled>, <reconstructed>)}.
    An optional `alphas` dict {'layer#': float in (0, 1)} controls how much
    style is blended in at each block (content share is 1 - alpha).
    """

    # Per-block contrast boost applied after decoding; tuned by eye to
    # remove artifacts noticed during testing.
    _CONTRAST = {1: 1.2, 2: 1.3, 3: 1.25}

    def __init__(self, files_path):
        super(VGG19AutoEncoder, self).__init__()
        # Load one trained model per VGG block.  Blocks 4 and 5 are NOT
        # loaded: they are poor auto-encoders and cause weird artifacts
        # when used for style transfer.
        # VGG Block 3
        ModelBlock3 = tf.keras.models.load_model(str(PurePath(files_path, 'Block3_Model')), compile=False)
        self.E3 = ModelBlock3.layers[0]  # VGG encoder
        self.D3 = ModelBlock3.layers[1]  # Trained decoder from VGG
        self.O3 = ModelBlock3.layers[2]  # Conv layer to get to three channels, RGB image
        # VGG Block 2
        ModelBlock2 = tf.keras.models.load_model(str(PurePath(files_path, 'Block2_Model')), compile=False)
        self.E2 = ModelBlock2.layers[0]  # VGG encoder
        self.D2 = ModelBlock2.layers[1]  # Trained decoder from VGG
        self.O2 = ModelBlock2.layers[2]  # Conv layer to get to three channels, RGB image
        # VGG Block 1 — no decoder: VGG's first layer has no down-sampling,
        # so "decoding" is just the output convolution.
        ModelBlock1 = tf.keras.models.load_model(str(PurePath(files_path, 'Block1_Model')), compile=False)
        self.E1 = ModelBlock1.layers[0]  # VGG encoder, one layer
        self.O1 = ModelBlock1.layers[1]  # Conv layer to get to three channels, RGB image

    def _block_ops(self, layer):
        """Return (encoder, decode_fn, contrast_factor) for VGG block `layer`."""
        if layer == 1:
            return self.E1, self.O1, self._CONTRAST[1]
        if layer == 2:
            return self.E2, (lambda z: self.O2(self.D2(z))), self._CONTRAST[2]
        return self.E3, (lambda z: self.O3(self.D3(z))), self._CONTRAST[3]

    def _stylize_block(self, x, style_image, content_image, layer, alpha):
        """Apply WCT at one VGG block.

        Returns (new_x, (styled_display, reconstructed_display)): `new_x`
        carries the styled, contrast-enhanced (but un-clipped) image forward
        to the next block; the display pair is squeezed and clipped to [0, 1].
        """
        encoder, decode, contrast = self._block_ops(layer)
        # Activations of the current (progressively styled) content and of
        # the style image at this block.
        a_c = encoder(tf.constant(x))
        a_s = encoder(tf.constant(style_image))
        # Whiten the content activations, color them with the style's
        # statistics, blended by alpha.
        x = self.wct_from_cov(a_c.numpy(), a_s.numpy(), alpha=alpha)
        # Decode back to image space and boost contrast.
        x = decode(x)
        x = self.enhance_contrast(x, contrast)
        # Plain auto-encoder reconstruction of the ORIGINAL content image,
        # kept for side-by-side display.
        reconst = decode(encoder(tf.constant(content_image)))
        styled_display = tf.clip_by_value(tf.squeeze(x), 0, 1)
        reconst_display = tf.clip_by_value(tf.squeeze(reconst), 0, 1)
        return x, (styled_display, reconst_display)

    def _transfer(self, image, alphas, layer_order):
        """Run WCT style transfer over the blocks in `layer_order`.

        The content image is styled progressively: the output of one block is
        the input to the next.  Returns {'layer#': (styled, reconstructed)}.
        """
        style_image = image['style']
        content_image = image['content']
        x = content_image
        if alphas is None:
            alphas = {'layer3': 0.6,
                      'layer2': 0.6,
                      'layer1': 0.6}
        output_dict = dict()
        for layer in layer_order:
            key = 'layer{}'.format(layer)
            x, output_dict[key] = self._stylize_block(
                x, style_image, content_image, layer, alphas[key])
        return output_dict

    def callReverse(self, image, alphas=None, training=False, reverse=False):
        """Same as call(), but operated in the opposite order of style
        transfer, i.e. shallow to deep (block 1 first)."""
        return self._transfer(image, alphas, (1, 2, 3))

    def call(self, image, alphas=None, training=False, reverse=False):
        """Perform the style transfer, deep-to-shallow (block 3 first).

        `image` is a dict {'style': <style_image>, 'content': <content_image>}
        of 4-D tensors.  `alphas` is an optional dict {'layer#': alpha} with
        alpha in (0, 1) giving the fraction of style transferred per block
        (content share is 1 - alpha; defaults to 0.6 everywhere).  `training`
        is unused.  With reverse=True the transfer runs shallow-to-deep.
        """
        if reverse is True:
            # Fix: the original passed alphas=None here, silently discarding
            # any caller-supplied alphas in reverse mode.
            return self.callReverse(image, alphas=alphas)
        return self._transfer(image, alphas, (3, 2, 1))

    @staticmethod
    def enhance_contrast(image, factor=1.25):
        """Boost contrast; used to address artifacts identified in testing."""
        return tf.image.adjust_contrast(image, factor)

    @staticmethod
    def wct_from_cov(content, style, alpha=0.6, eps=1e-5):
        """Whiten-color transform (WCT) on feature maps using numpy.

        `content` and `style` are 1xHxWxC activation tensors (activations of
        some network layer, not raw images).  `alpha` in (0, 1) is the
        fraction of style blended into the result; the content share is
        1 - alpha.  `eps` is added to the truncated singular values before
        taking square roots, to avoid roots/inverses of (near-)zero.

        The SVD is taken of the CxC channel covariance rather than of the raw
        activations: much cheaper, equivalent result.  See p.4 of
        https://arxiv.org/pdf/1705.08086.pdf and
        https://github.com/eridgd/WCT-TF/blob/master/ops.py
        """
        # 1xHxWxC -> CxHxW
        content_t = np.transpose(np.squeeze(content), (2, 0, 1))
        style_t = np.transpose(np.squeeze(style), (2, 0, 1))
        # CxHxW -> CxH*W
        content_flat = content_t.reshape(-1, content_t.shape[1]*content_t.shape[2])
        style_flat = style_t.reshape(-1, style_t.shape[1]*style_t.shape[2])
        # Keep only singular values above this threshold.
        eigen_val_thresh = 1e-5
        # === Whitening transform ===
        # fc_white = Uc diag(Sc)^(-1/2) Uc' fc, with fc the centered content.
        mc = content_flat.mean(axis=1, keepdims=True)
        fc = content_flat - mc
        cov_c = np.dot(fc, fc.T) / (content_t.shape[1]*content_t.shape[2] - 1)
        Uc, Sc, _ = np.linalg.svd(cov_c)
        k_c = (Sc > eigen_val_thresh).sum()
        Dc = np.diag((Sc[:k_c]+eps)**-0.5)
        Uc = Uc[:,:k_c]
        fc_white = (Uc @ Dc @ Uc.T) @ fc
        # === Coloring transform ===
        # fcs_hat = Us diag(Ss)^(1/2) Us' fc_white, with Us/Ss from the style
        # covariance; the style channel means are then added back.
        ms = style_flat.mean(axis=1, keepdims=True)
        fs = style_flat - ms
        cov_s = np.dot(fs, fs.T) / (style_t.shape[1]*style_t.shape[2] - 1)
        Us, Ss, _ = np.linalg.svd(cov_s)
        k_s = (Ss > eigen_val_thresh).sum()
        Ds = np.sqrt(np.diag(Ss[:k_s]+eps))
        Us = Us[:,:k_s]
        fcs_hat = (Us @ Ds @ Us.T) @ fc_white
        fcs_hat = fcs_hat + ms  # add style mean back to each channel
        # Blend transformed features with the original content features.
        blended = alpha*fcs_hat + (1 - alpha)*(content_flat)
        # CxH*W -> CxHxW -> 1xHxWxC
        blended = blended.reshape(content_t.shape)
        blended = np.expand_dims(np.transpose(blended, (1,2,0)), 0)
        return np.float32(blended)
AE = VGG19AutoEncoder('models/vgg_decoder/')
Here, we apply a mosaic pattern to Dallas Hall. To start with, the color palettes are fairly similar, but you can see some of the more vibrant colors of the mosaic transferred into appropriate locations in the content image. Furthermore, you can see the patterns applied, primarily the circular patterns. This is one of the better transfers that we attempted. Content images with people or many buildings did not work very well.
Depending on the alpha chosen, the style transfer images could appear the same among the 3 different layers. However, we know they are different because of the test done.
content_image = load_img('images/deadman.jpg')
style_image = load_img('images/mosaic.jpg')
alphas = {'layer3':0.4, 'layer2':0.5, 'layer1':0.5}
%%time
style_and_content = {'style':style_image, 'content':content_image}
decoded_images = AE(style_and_content, alphas)
Original Images
show_images_in_a_row([style_image, content_image], ["Style - Mosaic", 'Content - Dallas Hall'])
Layer 1 (Final Output)
show_images_of_a_layer(decoded_images, 1)
Layer 2
show_images_of_a_layer(decoded_images, 2)
Layer 3
show_images_of_a_layer(decoded_images, 3)
# The top two layers look the same, check it....
# Fix: the original compared against the bound method `.numpy` (missing
# parentheses), so the first comparison never compared actual pixel data.
print("layer1 == layer2: {}".format((decoded_images['layer1'][0].numpy() == decoded_images['layer2'][0].numpy()).all()))
print("layer2 == layer3: {}".format((decoded_images['layer2'][0].numpy() == decoded_images['layer3'][0].numpy()).all()))
Here, we transfer a pencil drawing onto Dallas Hall. This one worked pretty well. The lines and curves in the photo are modified to look more like hand-drawn pencil lines, and the color is washed out. We do see some artifacts that look like blurs. We believe that this is related to mild shadows and white spots in the pencil drawing. We feel that cropping the image to better isolate the flower would have addressed this. Regardless, we feel that there is a lot of style in the transferred image.
content_image = load_img('images/deadman.jpg')
style_image = load_img('images/pencil.jpg')
alphas = {'layer3':0.4, 'layer2':0.5, 'layer1':0.5}
%%time
style_and_content = {'style':style_image, 'content':content_image}
decoded_images = AE(style_and_content, alphas)
Original Images
show_images_in_a_row([style_image, content_image], ["Style - Pencil", 'Content - Dallas Hall'])
Layer 1
show_images_of_a_layer(decoded_images, 1)
Layer 2
show_images_of_a_layer(decoded_images, 2)
Layer 3
show_images_of_a_layer(decoded_images, 3)
# The top two layers look the same, check it....
# Fix: the original compared against the bound method `.numpy` (missing
# parentheses), so the first comparison never compared actual pixel data.
print("layer1 == layer2: {}".format((decoded_images['layer1'][0].numpy() == decoded_images['layer2'][0].numpy()).all()))
print("layer2 == layer3: {}".format((decoded_images['layer2'][0].numpy() == decoded_images['layer3'][0].numpy()).all()))
In class, we discussed starting the transfer deep in the network and working backward. The rationale was that it provided better images. We decided to take a look at that here. decoded_images
deep_to_shallow = decoded_images['layer1'][0]
decoded_images = AE(style_and_content, alphas=alphas, reverse=True)
shallow_to_deep = decoded_images['layer3'][0]
show_images_in_a_row([deep_to_shallow, shallow_to_deep],["L3->L1","L1->L3"])
Here we systematically change the alphas to see how blending different amounts of style and content affects the output. It was really more of an exercise to see what happens than anything else. The title of each image shows the alphas (% of style transferred). The first number is the 3rd-layer transfer amount, the second number is the 2nd-layer transfer amount, and the last number is the 1st-layer transfer amount.
%%time
# Sweep the per-layer alphas (fraction of style) over a coarse grid and
# collect the final (layer-1) styled image for each combination.
final_styling = []
for alpha1 in range(2, 10, 2):
    for alpha2 in range(2, 10, 2):
        for alpha3 in range(2, 10, 2):
            alphas['layer3'] = alpha3/10
            alphas['layer2'] = alpha2/10
            alphas['layer1'] = alpha1/10
            decoded_images = AE(style_and_content, alphas=alphas)
            # Fix: the original label printed layer2's alpha twice and never
            # layer1's; the title is layer3/layer2/layer1.
            alpha = "{}/{}/{}".format(alphas['layer3'], alphas['layer2'], alphas['layer1'])
            image = decoded_images['layer1'][0]
            final_styling.append((alpha, image))
# Fix: corrected the typo "Gnerated" in the user-facing message.
print("Generated {} styled images".format(len(final_styling)))
show_images_in_a_block(final_styling)
Below is a Keras model of a VGG auto-encoder. It's a pre-trained model. So where that pretrained data resides must be specified. In this model, the pretrained data is stored as weights and the weights_path in the class constructor. This pre-trained model is used for each of the five blocks (pooling layers) of VGG16. The format of each block is similar to the one from Chollet in Part 1. The two weights paths are loaded depending on whether to include the 3 fully-connected layers at the top of the network. When VGG16 is defined, we use "include_top=True" so this means that it is a series of convolutional layers followed by fully connected layers. Since we are doing this, we classify the dense layer with a block that uses softmax for the activation of classes. This VGG16 model uses an input layer followed by 5 blocks. The model is then saved to be used for the image masking. The average pooling operation was used instead of the max-pooling operation because the average improves the gradient flow and one obtains slightly more appealing visual results.
In order to allow both the VGG16 code and the masking code to work, we had to use Keras=2.2.0 and Tensorflow=1.10. The value 'theano' used in this code allows the environment variable KERAS_BACKEND to override what is defined in the config file.
#code comes from https://github.com/GongXinyuu/mask-neural-transfer/blob/master/vgg16featuremap.py
from __future__ import print_function
from __future__ import absolute_import
import warnings
from keras.models import Model
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Input
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import GlobalAveragePooling2D
from keras.layers import GlobalMaxPooling2D
from keras.engine.topology import get_source_inputs
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras import backend as K
from keras.applications.imagenet_utils import decode_predictions
from keras.applications.imagenet_utils import preprocess_input
from keras.applications.imagenet_utils import _obtain_input_shape
# Public release URLs for the pre-trained VGG16 ImageNet weights,
# with and without the fully-connected classification head.
WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5'
WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'
def VGG16FM(include_top=True, weights='imagenet',
            input_tensor=None, input_shape=None,
            pooling=None,
            classes=1000):
    """Instantiates the VGG16 architecture with per-block feature-map outputs.

    Besides the usual final tensor, the returned model exposes the first
    convolution activation of each of the five blocks
    (block1_conv1 ... block5_conv1) as extra outputs; the style-transfer
    code indexes these to precompute style/content reference features.

    Optionally loads weights pre-trained on ImageNet. Note that when using
    TensorFlow, for best performance you should set
    `image_data_format="channels_last"` in your Keras config at
    ~/.keras/keras.json. The model and the weights are compatible with both
    TensorFlow and Theano; the data format convention used by the model is
    the one specified in your Keras config file.

    # Arguments
        include_top: whether to include the 3 fully-connected
            layers at the top of the network.
        weights: one of `None` (random initialization)
            or "imagenet" (pre-training on ImageNet).
        input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
            to use as image input for the model.
        input_shape: optional shape tuple, only to be specified
            if `include_top` is False. It should have exactly 3 input
            channels, and width and height should be no smaller than 48.
            E.g. `(200, 200, 3)` would be one valid value.
        pooling: optional pooling mode for feature extraction when
            `include_top` is `False`:
            - `None`: the model output is the 4D tensor output of the
              last convolutional layer.
            - `avg`: global average pooling is applied to that tensor,
              so the output is a 2D tensor.
            - `max`: global max pooling is applied instead.
        classes: optional number of classes to classify images
            into, only to be specified if `include_top` is True, and
            if no `weights` argument is specified.

    # Returns
        A Keras model instance whose outputs are
        [x, x1, x2, x3, x4, x5] (final tensor plus per-block conv1 maps).

    # Raises
        ValueError: in case of invalid argument for `weights`,
            or invalid input shape.
    """
    if weights not in {'imagenet', None}:
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization) or `imagenet` '
                         '(pre-training on ImageNet).')
    if weights == 'imagenet' and include_top and classes != 1000:
        raise ValueError('If using `weights` as imagenet with `include_top`'
                         ' as true, `classes` should be 1000')
    # Determine proper input shape.
    # NOTE(review): default_size is 256 here (not VGG16's canonical 224),
    # matching the 256x256 images used elsewhere in this notebook — confirm
    # before reusing this function for plain classification.
    input_shape = _obtain_input_shape(input_shape,
                                      default_size=256,
                                      min_size=48,
                                      data_format=K.image_data_format(),
                                      require_flatten=False)
    if input_tensor is None:
        img_input = Input(shape=input_shape)
    else:
        if not K.is_keras_tensor(input_tensor):
            img_input = Input(tensor=input_tensor, shape=input_shape)
        else:
            img_input = input_tensor
    # Each xN below keeps the block's first conv activation so it can be
    # returned as an auxiliary model output.
    # Block 1
    x1 = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv1')(img_input)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x1)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
    # Block 2
    x2 = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x2)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
    # Block 3
    x3 = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x3)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
    # Block 4
    x4 = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x4)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
    # Block 5
    x5 = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x5)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
    if include_top:
        # Classification block
        x = Flatten(name='flatten')(x)
        x = Dense(4096, activation='relu', name='fc1')(x)
        x = Dense(4096, activation='relu', name='fc2')(x)
        x = Dense(classes, activation='softmax', name='predictions')(x)
    else:
        if pooling == 'avg':
            x = GlobalAveragePooling2D()(x)
        elif pooling == 'max':
            x = GlobalMaxPooling2D()(x)
    # Ensure that the model takes into account
    # any potential predecessors of `input_tensor`.
    if input_tensor is not None:
        inputs = get_source_inputs(input_tensor)
    else:
        inputs = img_input
    # Create model with the auxiliary per-block conv1 outputs.
    model = Model(inputs, [x, x1, x2, x3, x4, x5], name='vgg16FM')
    # load weights
    if weights == 'imagenet':
        if include_top:
            weights_path = get_file('vgg16_weights_tf_dim_ordering_tf_kernels.h5',
                                    WEIGHTS_PATH,
                                    cache_subdir='models')
        else:
            weights_path = get_file('vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5',
                                    WEIGHTS_PATH_NO_TOP,
                                    cache_subdir='models')
        model.load_weights(weights_path)
        if K.backend() == 'theano':
            # TF-format kernels must be flipped for the Theano backend.
            layer_utils.convert_all_kernels_in_model(model)
        if K.image_data_format() == 'channels_first':
            if include_top:
                # fc1 weights were trained channels_last; convert in place.
                maxpool = model.get_layer(name='block5_pool')
                shape = maxpool.output_shape[1:]
                dense = model.get_layer(name='fc1')
                layer_utils.convert_dense_weights_data_format(dense, shape, 'channels_first')
            if K.backend() == 'tensorflow':
                warnings.warn('You are using the TensorFlow backend, yet you '
                              'are using the Theano '
                              'image data format convention '
                              '(`image_data_format="channels_first"`). '
                              'For best performance, set '
                              '`image_data_format="channels_last"` in '
                              'your Keras config '
                              'at ~/.keras/keras.json.')
    return model
https://arxiv.org/pdf/1508.06576.pdf was helpful in understanding the different aspects of the mask code.
In terms of the actual masked image, we created our own. Most of the code found to make a mask utilized basic shapes such as circles and rectangles. Also, different code to create a mask was not able to pick up the building or the dog well (the SMU building background is a little more detailed, so that was to be expected).
In the code below, a given input image is encoded in each layer of the CNN by the filter responses to that image. Each layer has a distinct number of filters and that same number of feature maps each of a specific size (this size is the height times the width of the feature map). The responses in a layer can be stored in a matrix where the activation is encoded (at a specific filter, specific position, and specific layer).
In addition to the CNN responses in each layer of the network, there is a style representation that computes the correlations between the different filter responses, where the expectation is taken over the spatial extent of the input image. These feature correlations are given by the Gram matrix. The Gram matrix is used in the style loss function, while the content loss only uses the combination image.
To generate the masking images, the distance is minimized of the masking image from the style background image representation in one layer of the network and the style key image representation in a number of layers of the CNN. The loss function is minimized for the content and style reconstruction. The content representation is matched on layer 'conv4_2' (a deep layer was desired) and the style representations on layers 'conv1_1', 'conv2_1', 'conv3_1', 'conv4_1', and 'conv5_1'.
All of the loss functions are combined into a single scalar and the masked image, style key image, and style background image are combined into a single Keras tensor to be used as input for the VGG16 model. The "style loss" function is designed to maintain the style of the reference image in the generated image, the "auxiliary loss" function is designed to maintain the "content" of the base image in the generated image, and the "total variation loss" function is designed to keep the generated image locally coherent. The loss single scalar function is what is used for the rest of the code.
The code below was pretty well commented to begin with, but we added more comments to help understand what each line of code is doing and to explain what certain functions are doing.
#code comes from https://github.com/GongXinyuu/mask-neural-transfer/blob/master/mask_style_transfer.py
from __future__ import print_function
from keras.preprocessing.image import load_img, img_to_array
from scipy.misc import imsave
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
import time
from keras.utils import plot_model
from keras.applications import vgg16
from keras import backend as K
base_image_path = "/mnt/c/Users/trozz/Desktop/images/deadman.jpg"  # content image
mask_path = "/mnt/c/Users/trozz/Desktop/images/deadman_mask.jpg"  # black/white mask separating key region from background
style_reference_background_image_path = "/mnt/c/Users/trozz/Desktop/images/mosaic.jpg"  # style applied outside the mask
style_reference_key_image_path = "/mnt/c/Users/trozz/Desktop/images/pencil.jpg"  # style applied inside the mask
result_prefix = "/mnt/c/Users/trozz/Desktop/images/mask_style_transfer.jpg"  # prefix for the saved iteration frames
# these are the weights of the different loss components
total_variation_weight = 1 # A larger value may cause blur
style_weight = 100.0
content_weight = 0.0  # content term disabled: only style + total variation drive the result
# dimensions of the generated picture.
# NOTE(review): width/height of the source image are read but never used;
# the generated output is fixed at 256x256 regardless of the input aspect ratio.
width, height = load_img(base_image_path).size
img_nrows = 256
img_ncols = 256
# util function to open, resize and format pictures into appropriate tensors
def preprocess_image(image_path):
    """Load an image, resize it to (img_nrows, img_ncols) and turn it into
    a zero-centered 4-D batch tensor suitable for VGG16."""
    pil_img = load_img(image_path, target_size=(img_nrows, img_ncols))
    batch = np.expand_dims(img_to_array(pil_img), axis=0)
    # vgg16.preprocess_input subtracts the per-channel mean of the
    # ImageNet training set from the batch.
    return vgg16.preprocess_input(batch)
# util function to convert a tensor into a valid image
#reshaping image
def deprocess_image(x):
    """Convert a flat optimizer tensor back into a displayable uint8 RGB image.

    Reverses the VGG preprocessing: reshape to (img_nrows, img_ncols, 3),
    add back the ImageNet mean pixel, flip BGR->RGB and clip to [0, 255].
    """
    if K.image_data_format() == 'channels_first':
        x = x.reshape((3, img_nrows, img_ncols))
        x = x.transpose((1, 2, 0))
    else:
        x = x.reshape((img_nrows, img_ncols, 3))
    # Remove zero-center by mean pixel. Uses the module-level MEAN_PIXEL
    # (BGR ImageNet mean defined at the top of this file) instead of
    # repeating the three hard-coded per-channel additions.
    x += MEAN_PIXEL
    # 'BGR'->'RGB'
    x = x[:, :, ::-1]
    x = np.clip(x, 0, 255).astype('uint8')  # limit values to 0-255
    return x
# get tensor representations of our images
base_image = K.variable(preprocess_image(base_image_path))  # content image as a Keras variable
style_reference_background_image = K.variable(preprocess_image(style_reference_background_image_path))  # style image for the background region
style_reference_key_image = K.variable(preprocess_image(style_reference_key_image_path))  # style image for the masked (key) region
mask_image = img_to_array(load_img(mask_path, target_size=(img_nrows, img_ncols)))  # mask as an (nrows, ncols, 3) array
# The mask is expected to be black/white: white pixels (>0) mark the "key"
# region, black pixels (==0) the background. Multiplying the boolean arrays
# by 1.0 simply casts them to floats so they can scale the image tensors below.
mask_key_bool = (mask_image > 0) * 1.0
mask_background_bool = (mask_image == 0) * 1.0
# this will contain our generated image
if K.image_data_format() == 'channels_first':
    combination_image = K.placeholder((1, 3, img_nrows, img_ncols))  # placeholder for the image being optimized
else:
    combination_image = K.placeholder((1, img_nrows, img_ncols, 3))
# Stack the key-masked and background-masked views of the generated image
# along the batch axis: batch item 0 is the key region, item 1 the background.
input_tensor = K.concatenate([combination_image * mask_key_bool,
                              combination_image * mask_background_bool], axis=0)
# build the VGG16 network with the masked views as input;
# the model is loaded with pre-trained ImageNet weights
model = vgg16.VGG16(input_tensor=input_tensor,
                    weights='imagenet', include_top=False)
print('Model loaded.')
# get the symbolic outputs of each "key" layer (we gave them unique names).
outputs_dict = dict([(layer.name, layer.output) for layer in model.layers])
# compute the neural style loss
# first we need to define 4 util functions
# the gram matrix of an image tensor (feature-wise outer product)
def gram_matrix(x):
    """Feature-wise outer product (Gram matrix) of a 3-D feature tensor."""
    if K.image_data_format() == 'channels_first':
        flat = K.batch_flatten(x)
    else:
        # Move channels first so each row of `flat` is one feature map.
        flat = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
    return K.dot(flat, K.transpose(flat))
# the "style loss" is designed to maintain
# the style of the reference image in the generated image.
# It is based on the gram matrices (which capture style) of
# feature maps from the style reference image
# and from the generated image
def style_loss(style, combination):
    """Scaled squared distance between the Gram matrices of the style
    reference features and the generated-image features."""
    gram_style = gram_matrix(style)
    gram_comb = gram_matrix(combination)
    channels = 3
    size = img_nrows * img_ncols
    normalizer = 4. * (channels ** 2) * (size ** 2)
    return K.sum(K.square(gram_style - gram_comb)) / normalizer
# an auxiliary loss function
# designed to maintain the "content" of the
# base image in the generated image
def content_loss(base, combination):
    """Sum of squared differences between base and generated features."""
    diff = combination - base
    return K.sum(K.square(diff))
# the 3rd loss function, total variation loss,
# designed to keep the generated image locally coherent
def total_variation_loss(x):
    """Encourage local smoothness by penalizing differences between
    neighbouring pixels of the generated image."""
    assert K.ndim(x) == 4
    if K.image_data_format() == 'channels_first':
        # Squared differences between vertically / horizontally shifted crops.
        vert = K.square(x[:, :, :img_nrows - 1, :img_ncols - 1] - x[:, :, 1:, :img_ncols - 1])
        horiz = K.square(x[:, :, :img_nrows - 1, :img_ncols - 1] - x[:, :, :img_nrows - 1, 1:])
    else:
        vert = K.square(x[:, :img_nrows - 1, :img_ncols - 1, :] - x[:, 1:, :img_ncols - 1, :])
        horiz = K.square(x[:, :img_nrows - 1, :img_ncols - 1, :] - x[:, :img_nrows - 1, 1:, :])
    return K.sum(K.pow(vert + horiz, 1.25))
# combine these loss functions into a single scalar
# Style is matched on the first conv layer of each VGG block.
feature_layers = ['block1_conv1', 'block2_conv1',
                  'block3_conv1', 'block4_conv1',
                  'block5_conv1']
# Separate feature-map model (VGG16FM above) whose extra outputs are the
# blockN_conv1 activations; used to precompute the reference features.
modelFM = VGG16FM(input_tensor=None,
                  weights='imagenet', include_top=False)
# runs the background style image through the model
features_back = modelFM.predict(preprocess_image(style_reference_background_image_path))
# runs the key style image through the model
features_key = modelFM.predict(preprocess_image(style_reference_key_image_path))
# runs the content image through the model
features_content = modelFM.predict(preprocess_image(base_image_path))
# count indexes the VGG16FM output list: outputs[1..5] are the
# block1..block5 conv1 feature maps (outputs[0] is the final tensor)
count = 1
loss = K.variable(0.)
layer_features = outputs_dict['block4_conv2']
# NOTE(review): features_content[4] is the block4_conv1 output of VGG16FM,
# while layer_features above is block4_conv2 of the masked model — a
# possible layer mismatch; harmless here since content_weight is 0.0.
base_image_features = features_content[4][0]
# combines the key-region and background-region activations of the generated
# image (batch items 0 and 1 of the masked input tensor)
combination_features = layer_features[0, :, :, :] + layer_features[1, :, :, :]
# content term (weighted 0.0 in this configuration, so effectively disabled)
loss += content_weight * content_loss(base_image_features,
                                      combination_features)
# Accumulate the style term layer by layer: the key region is matched to the
# key style image and the background region to the background style image.
for layer_name in feature_layers:
    layer_features = outputs_dict[layer_name]
    style_reference_key_features = features_key[count][0]
    style_reference_background_features = features_back[count][0]
    combination_key_features = layer_features[0, :, :, :]
    combination_background_features = layer_features[1, :, :, :]
    sl_key = style_loss(style_reference_key_features, combination_key_features)
    sl_background = style_loss(style_reference_background_features, combination_background_features)
    loss += (style_weight / len(feature_layers)) * (sl_key + sl_background)
    count += 1
# total-variation regularizer on the generated image
loss += total_variation_weight * total_variation_loss(combination_image)
# get the gradients of the generated image wrt the loss
grads = K.gradients(loss, combination_image)
outputs = [loss]
if isinstance(grads, (list, tuple)):
    outputs += grads
else:
    outputs.append(grads)
f_outputs = K.function([combination_image], outputs)  # evaluates loss and gradients as NumPy arrays
def eval_loss_and_grads(x):
    """Run the compiled Keras function on a flat image vector and return
    (loss, flattened float64 gradients)."""
    if K.image_data_format() == 'channels_first':
        shape = (1, 3, img_nrows, img_ncols)
    else:
        shape = (1, img_nrows, img_ncols, 3)
    outs = f_outputs([x.reshape(shape)])
    loss_value = outs[0]
    grad_outs = outs[1:]
    if len(grad_outs) == 1:
        grad_values = grad_outs[0].flatten().astype('float64')
    else:
        grad_values = np.array(grad_outs).flatten().astype('float64')
    return loss_value, grad_values
# this Evaluator class makes it possible
# to compute loss and gradients in one pass
# while retrieving them via two separate functions,
# "loss" and "grads". This is done because scipy.optimize
# requires separate functions for loss and gradients,
# but computing them separately would be inefficient.
class Evaluator(object):
    """Caches loss and gradients from a single eval_loss_and_grads call so
    that scipy's fmin_l_bfgs_b, which wants separate loss and gradient
    callbacks, only evaluates the (expensive) Keras graph once per step.
    """
    def __init__(self):
        self.loss_value = None
        # BUGFIX: this attribute was previously misspelled `grads_values`,
        # leaving `grad_values` (the name every other method uses)
        # undefined until the first loss() call.
        self.grad_values = None

    def loss(self, x):
        """Compute and cache loss and gradients; return the loss."""
        assert self.loss_value is None
        loss_value, grad_values = eval_loss_and_grads(x)
        self.loss_value = loss_value
        self.grad_values = grad_values
        return self.loss_value

    def grads(self, x):
        """Return the gradients cached by the preceding loss() call and
        clear the cache for the next optimizer step."""
        assert self.loss_value is not None
        grad_values = np.copy(self.grad_values)
        self.loss_value = None
        self.grad_values = None
        return grad_values
evaluator = Evaluator()
x = preprocess_image(base_image_path)  # start the optimization from the content image
#save loss function values for plotting below
loss_dedman = []
# Run 20 rounds of L-BFGS-B over the flattened image, saving every other frame.
for i in range(20):
    print('Start of iteration', i)
    start_time = time.time()
    # evaluator.loss / evaluator.grads share one Keras evaluation per step
    x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x.flatten(),
                                     fprime=evaluator.grads, maxfun=20)
    print('Current loss value:', min_val)
    loss_dedman.append(min_val)
    # save current generated image
    img = deprocess_image(x.copy())
    fname = result_prefix + '_at_iteration_%d.png' % i
    if i % 2 == 0:  # only write every second frame to disk
        imsave(fname, img)
        print('Image saved as', fname)
    end_time = time.time()
    print('Iteration %d completed in %ds' % (i, end_time - start_time))
# NOTE(review): plt is only imported in a later cell; this relies on
# matplotlib.pyplot having been imported as plt earlier in the notebook.
plt.title('Masked style transfer Dedman Loss')
plt.plot(loss_dedman)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.show()
import matplotlib.pyplot as plt

# Show the saved Dedman frames in a 2x3 grid. BUGFIX: the iteration numbers
# in the titles previously did not match the files being displayed (the
# frame from iteration 0 was labelled "iteration 1"); titles are now derived
# from the same iteration list used to load the images, so they cannot drift.
ITERATIONS_TO_SHOW = [0, 2, 6, 10, 14, 18]
NUM_ROWS = 2
IMGs_IN_ROW = 3
f, ax = plt.subplots(NUM_ROWS, IMGs_IN_ROW, figsize=(10, 10))
for axis, iteration in zip(ax.flat, ITERATIONS_TO_SHOW):
    img = plt.imread('/mnt/c/Users/trozz/Desktop/images/'
                     'mask_style_transfer.jpg_at_iteration_%d.png' % iteration)
    axis.imshow(img)
    axis.set_title('Mask transfer at iteration %d' % iteration)
title = 'Mask style transfer at different iterations'
f.suptitle(title, fontsize=15)
plt.tight_layout()
plt.show()
from __future__ import print_function
from keras.preprocessing.image import load_img, img_to_array
from scipy.misc import imsave
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
import time
from keras.utils import plot_model
from keras.applications import vgg16
from keras import backend as K
base_image_path = "/mnt/c/Users/trozz/Desktop/images/corgi.jpg"  # content image
mask_path = "/mnt/c/Users/trozz/Desktop/images/dog_mask.png"  # black/white mask isolating the dog
style_reference_background_image_path = "/mnt/c/Users/trozz/Desktop/images/mosaic.jpg"  # style applied outside the mask
style_reference_key_image_path = "/mnt/c/Users/trozz/Desktop/images/pencil.jpg"  # style applied inside the mask
result_prefix = "/mnt/c/Users/trozz/Desktop/images/corgi_transfer"  # prefix for the saved iteration frames
# these are the weights of the different loss components
total_variation_weight = 1#8.5e-5 # A larger value may cause blur
style_weight = 100.0
content_weight = 0.0  # content term disabled: only style + total variation drive the result
# dimensions of the generated picture.
# NOTE(review): width/height are read but never used; output is fixed at 256x256.
width, height = load_img(base_image_path).size
img_nrows = 256
img_ncols = 256
# get tensor representations of our images (reusing preprocess_image and the
# loss helpers defined in the previous section)
base_image = K.variable(preprocess_image(base_image_path))  # content image as a Keras variable
style_reference_background_image = K.variable(preprocess_image(style_reference_background_image_path))  # style image for the background region
style_reference_key_image = K.variable(preprocess_image(style_reference_key_image_path))  # style image for the masked (key) region
mask_image = img_to_array(load_img(mask_path, target_size=(img_nrows, img_ncols)))  # mask as an (nrows, ncols, 3) array
# The mask is expected to be black/white: white pixels (>0) mark the "key"
# region, black pixels (==0) the background. Multiplying the boolean arrays
# by 1.0 simply casts them to floats so they can scale the image tensors below.
mask_key_bool = (mask_image > 0) * 1.0
mask_background_bool = (mask_image == 0) * 1.0
# this will contain our generated image
if K.image_data_format() == 'channels_first':
    combination_image = K.placeholder((1, 3, img_nrows, img_ncols))  # placeholder for the image being optimized
else:
    combination_image = K.placeholder((1, img_nrows, img_ncols, 3))
# Stack the key-masked and background-masked views of the generated image
# along the batch axis: batch item 0 is the key region, item 1 the background.
input_tensor = K.concatenate([combination_image * mask_key_bool,
                              combination_image * mask_background_bool], axis=0)
# build the VGG16 network with the masked views as input;
# the model is loaded with pre-trained ImageNet weights
model = vgg16.VGG16(input_tensor=input_tensor,
                    weights='imagenet', include_top=False)
print('Model loaded.')
# get the symbolic outputs of each "key" layer (we gave them unique names).
outputs_dict = dict([(layer.name, layer.output) for layer in model.layers])
# Style is matched on the first conv layer of each VGG block.
feature_layers = ['block1_conv1', 'block2_conv1',
                  'block3_conv1', 'block4_conv1',
                  'block5_conv1']
# Feature-map model used to precompute the reference activations.
modelFM = VGG16FM(input_tensor=None,
                  weights='imagenet', include_top=False)
features_back = modelFM.predict(preprocess_image(style_reference_background_image_path))  # background style features
features_key = modelFM.predict(preprocess_image(style_reference_key_image_path))  # key style features
features_content = modelFM.predict(preprocess_image(base_image_path))  # content features
# count indexes the VGG16FM output list: outputs[1..5] are the
# block1..block5 conv1 feature maps (outputs[0] is the final tensor)
count = 1
loss = K.variable(0.)
layer_features = outputs_dict['block4_conv2']
# NOTE(review): features_content[4] is the block4_conv1 output of VGG16FM,
# while layer_features above is block4_conv2 of the masked model — a
# possible layer mismatch; harmless here since content_weight is 0.0.
base_image_features = features_content[4][0]
# key-region plus background-region activations of the generated image
combination_features = layer_features[0, :, :, :] + layer_features[1, :, :, :]
# content term (weighted 0.0 in this configuration, so effectively disabled)
loss += content_weight * content_loss(base_image_features,
                                      combination_features)
# Accumulate the style term: key region vs key style image, background
# region vs background style image, layer by layer.
for layer_name in feature_layers:
    layer_features = outputs_dict[layer_name]
    style_reference_key_features = features_key[count][0]
    style_reference_background_features = features_back[count][0]
    combination_key_features = layer_features[0, :, :, :]
    combination_background_features = layer_features[1, :, :, :]
    sl_key = style_loss(style_reference_key_features, combination_key_features)
    sl_background = style_loss(style_reference_background_features, combination_background_features)
    loss += (style_weight / len(feature_layers)) * (sl_key + sl_background)
    count += 1
loss += total_variation_weight * total_variation_loss(combination_image)  # smoothness regularizer
# get the gradients of the generated image wrt the loss
grads = K.gradients(loss, combination_image)
outputs = [loss]
if isinstance(grads, (list, tuple)):
    outputs += grads
else:
    outputs.append(grads)
f_outputs = K.function([combination_image], outputs)  # evaluates loss and gradients as NumPy arrays
def eval_loss_and_grads(x):
    """Run the compiled Keras function on a flat image vector and return
    (loss, flattened float64 gradients).

    Note: this redefines the identical helper from the previous section so
    it binds to this section's f_outputs.
    """
    if K.image_data_format() == 'channels_first':
        shape = (1, 3, img_nrows, img_ncols)
    else:
        shape = (1, img_nrows, img_ncols, 3)
    outs = f_outputs([x.reshape(shape)])
    loss_value = outs[0]
    grad_outs = outs[1:]
    if len(grad_outs) == 1:
        grad_values = grad_outs[0].flatten().astype('float64')
    else:
        grad_values = np.array(grad_outs).flatten().astype('float64')
    return loss_value, grad_values
evaluator = Evaluator()
x = preprocess_image(base_image_path)  # start the optimization from the corgi content image
loss_dog = []  # per-iteration loss history for the plot below
# Run 20 rounds of L-BFGS-B over the flattened image, saving every other frame.
for i in range(20):
    print('Start of iteration', i)
    start_time = time.time()
    # evaluator.loss / evaluator.grads share one Keras evaluation per step
    x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x.flatten(),
                                     fprime=evaluator.grads, maxfun=20)
    print('Current loss value:', min_val)
    loss_dog.append(min_val)
    # save current generated image
    img = deprocess_image(x.copy())
    fname = result_prefix + '_at_iteration_%d.png' % i
    if i % 2 == 0:  # only write every second frame to disk
        imsave(fname, img)
        print('Image saved as', fname)
    end_time = time.time()
    print('Iteration %d completed in %ds' % (i, end_time - start_time))
plt.title('Masked style transfer Corgi Loss')
plt.plot(loss_dog)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.show()
import matplotlib.pyplot as plt

# Show the saved corgi frames in a 2x3 grid. BUGFIX: the titles previously
# did not match the iterations being displayed (frames 0 and 4 were labelled
# "iteration 1" and "iteration 2"); titles are now derived from the same
# iteration list used to load the images, so they cannot drift.
ITERATIONS_TO_SHOW = [0, 4, 6, 10, 14, 18]
NUM_ROWS = 2
IMGs_IN_ROW = 3
f, ax = plt.subplots(NUM_ROWS, IMGs_IN_ROW, figsize=(10, 10))
for axis, iteration in zip(ax.flat, ITERATIONS_TO_SHOW):
    img = plt.imread('/mnt/c/Users/trozz/Desktop/images/corgi_transfer_at_iteration_%d.png' % iteration)
    axis.imshow(img)
    axis.set_title('Mask transfer at iteration %d' % iteration)
title = 'Mask style transfer at different iterations'
f.suptitle(title, fontsize=15)
plt.tight_layout()
plt.show()
From the Dallas Hall and corgi examples, using different styles for each mask region, we observed a visually satisfactory style transfer. The losses showed convergence after around 5 iterations of L-BFGS-B minimization of the combined loss (style and content). We noticed that, unlike our encoder-decoder, where the WCT retained (at least partially) the color of the content image, here the color of the style-transferred images is completely dominated by the style images. This tells us that the Gram matrix is responsible for the color information.
P.S: the pencil corgi with the mosaic in the background is very cute.